#################################################################
##DO SELF-REPORTING REGIMES MATTER? EVIDENCE FROM THE CAT########
## Cosette D. Creamer and Beth A. Simmons #######################
## February 2019 ###############################################
### Replication Script for IPTW MSM Analysis with LHRPS ##########
### ########################### ################################
#################################################################
#################################################################
#################################################################
## Clear environment
rm(list=ls())
sink("CAT LHRPS MSM Analysis.txt")
### Libraries
library(doParallel)
library(foreach)




### Simulation with random sample from simulated population
number_core<-detectCores()-1
cl<-makeCluster(number_core)
registerDoParallel(cl)

##################################################

foreach (alphavalue =seq(-0.17,0.17,0.01)) %dopar% {
  iterations <-20000
  mod<-seq(1,2, 1)
  for (m in mod){
    model <- m
    lead <- 2
    seed <- 1234
    alpha<-alphavalue  
    if (!is.na(seed)){
      set.seed(seed, kind="L'Ecuyer-CMRG")
    }
    
    
    
    ##################################################
    ####### Setting Up R     					  		  ########
    ##################################################
    
    ### Libraries
    library(data.table)
    library(MASS)
    library(car)
    library(SparseM)
    
    ##############################################
    ### Data loading and pre-processing
    ##############################################
    

    ## Load CAT data (full)
    sink("CAT LHRPS MSM Analysis.txt", append=TRUE)
    catfull <- read.csv("CATReporting_Final.csv", header=TRUE)
    
    
    ## Subset data to only include observations for CAT ratifiers, for observations > t+1 (t = year of ratification)
    cat <- subset(catfull, catfull$CATob==1)
    
    ### For estimates for Weighting Model 1 (Table A1, Model 1)
    if (model == 1){
      ## Remove observations with missingness for covariates used
      
      cat <- subset(cat, !is.na(cat$gdpl1) & !is.na(cat$poptot) &  !is.na(cat$farissl1) &
                      !is.na(cat$polityl1) & !is.na(cat$CATreview)  & !is.na(cat$nhril1) 
                    & !is.na(cat$CATart22l1)  & !is.na(cat$CATregiondensityl1) & !is.na(cat$nevdem) &
                      !is.na(cat$regionall1) & !is.na(cat$transdeml1) & 
                      !is.na(cat$transition2l1) &!is.na(cat$hrtpropl1))
      
      
      cat$CATregiondensityl1 <- 100*cat$CATregiondensityl1
      cat$HRTpercentl1 <- 100*cat$hrtpropl1
      cat$yr <-as.factor(cat$year)
      cat$lngdpl1 <- log(cat$gdpl1)
      cat$lnpop <- log(cat$poptot)
      
      ## Define a dataset with no missing obs for LHRS t+1 or t+2 (depending on argument)
      if (lead == 1){
        catuse <- subset(cat, !is.na(cat$farissf1))
        catuse$outcome <- catuse$farissf1
      }else if (lead == 2){
        catuse <- subset(cat, !is.na(cat$farissf2))
        catuse$outcome <- catuse$farissf2
      }else{
        stop("'lead' argument must be 1 or 2")
      }
      
      ##### Sort dataset by country- year
      catuse <- catuse[order(catuse$cowcode, catuse$year),]
      
#### For estimates for Weighting Model 3 (Table A1, Model 3)
    }else if (model == 2){
      
      cat <- subset(cat, !is.na(cat$gdpl1) & !is.na(cat$poptot) &  !is.na(cat$farissl1) &
                      !is.na(cat$polityl1) & !is.na(cat$CATreview)  & !is.na(cat$nhril1) 
                    & !is.na(cat$CATart22l1)  & !is.na(cat$CATregiondensityl1) & !is.na(cat$nevdem) &
                      !is.na(cat$regionall1) & !is.na(cat$transdeml1) & 
                      !is.na(cat$transition2l1) &!is.na(cat$hrtpropl1) & !is.na(cat$transparencyindex_l1))

      cat$CATregiondensityl1 <- 100*cat$CATregiondensityl1
      cat$HRTpercentl1 <- 100*cat$hrtpropl1
      cat$yr <-as.factor(cat$year)
      cat$lngdpl1 <- log(cat$gdpl1)
      cat$lnpop <- log(cat$poptot)
  
      
      ## Define a dataset with no missing obs for LHRS t+1 or t+2 (depending on argument)
      if (lead == 1){
        catuse <- subset(cat, !is.na(cat$farissf1))
        catuse$outcome <- catuse$farissf1
      }else if (lead == 2){
        catuse <- subset(cat, !is.na(cat$farissf2))
        catuse$outcome <- catuse$farissf2
      }else{
        stop("'lead' argument must be 1 or 2")
      }
      
      ##### Sort dataset by country- year
      catuse <- catuse[order(catuse$cowcode, catuse$year),]
    }else{
      stop("Model must be 1 or 2")
    }
    
    
    ###################################################
    ##### Main estimation/bootstrap routine ########
    ##### for Weighting Models 1 and 3 (LHRPS)########
    ###################################################
    if (model == 1){
      ## Weighting Model 1
      n_country <- length(unique(catuse$cowcode))
      
      ## Number of coefficients
      n_coef <- 5
      
      ## Placeholder for holding bootstrap coefficients
      boot_coefs <- as.data.frame(matrix(nrow=iterations, ncol=(n_coef+1)))
      
      # Start bootstrap loop - since we're concerned about sampling datasets that have non-convergence
      # we make this a while loop and use tryCatch when dealing with non-feasible datasets.
      
      # Initialize iteration number
      iter <- 1
      while(iter <= iterations){
        print(iter)
        # If it's not the first run, bootstrap
        if (iter != 1){
          # Resample dataset by country clusters
          boot.data <- data.frame()
          for (kit in 1:n_country){
            cnum <- sample(unique(catuse$cowcode), size=1) # Sample a country
            sampled.country <- catuse[catuse$cowcode == cnum,]
            sampled.country$uniqueID <- kit
            boot.data <- rbind(boot.data, sampled.country) # Append to the bootstrap dataset
          }
          # If it's the first run, use the raw data
        }else{
          boot.data <- catuse
          boot.data$uniqueID <- boot.data$cowcode
        }
        
        ### Get Coefficients - catch any errors
        MF1 <- tryCatch(
          {
            # Denominator model (YEAR + treatment(review) history variables + time-varying covariates)
            denom.fit <-  glm(CATreview ~ lngdpl1 + lnpop 
                              +nhril1 +polityl1+ farissl1 + HRTpercentl1 + CATart22l1 +
                                CATrevprevyr + CATyrssincereview + CATnumprevrevs+nevdem +
                                CATregiondensityl1+ regionall1+ transdeml1+transition2l1+yr,family="binomial", data=boot.data)
            denom.p <- predict(denom.fit, type = "response")
            
            
            # Numerator model (YEAR + treatment(review) history variables)
            numer.fit <-  glm(CATreview ~  CATrevprevyr + CATyrssincereview + CATnumprevrevs + 
                                nevdem + yr, family="binomial", data=boot.data)
            numer.p <- predict(numer.fit, type = "response")
            
            if (denom.fit$converged == 0 || numer.fit$converged == 0){
              stop("Probability models didn't converge")
            }else{
              ## Calculate weights
              boot.data$treat.prob <- ifelse(boot.data$CATreview == 1, denom.p, denom.p)
              boot.data$d.pr.tr <- ifelse(boot.data$CATreview == 1, denom.p, 1-denom.p)
              boot.data$n.pr.tr <- ifelse(boot.data$CATreview == 1, numer.p, 1-numer.p)
              boot.data$w <- boot.data$n.pr.tr/boot.data$d.pr.tr
              
              ## Bias-adjusted outcome - alpha*(1-1)*Pr(1) + alpha*(1-0)*Pr(0) for units receiving treatment and alpha*(0-1)*Pr(1) + alpha(0-0)*Pr(0) for units receiving control
              boot.data$time.bias <- ifelse(boot.data$CATreview == 1, alpha*(1-boot.data$treat.prob), -alpha*(boot.data$treat.prob))
              
              ## Observation weight is product over time
              boot.data.dt <- data.table(boot.data)
              # Product of weights
              boot.data.dt[ intersect(order(year), which(!is.na(w))), swrev := cumprod(w), by=uniqueID]
              # Bias at time period
              boot.data.dt[ intersect(order(year), which(!is.na(time.bias))), s.time.bias := cumsum(time.bias), by=uniqueID]
              boot.data <- as.data.frame(boot.data.dt)
              
              
              ###################################################
              ##### Sensitivity analysis transformations
              ###################################################
              if (alpha !=0){
                boot.data$outcomeAdj <- boot.data$outcome - boot.data$s.time.bias
              }else{
                boot.data$outcomeAdj <- boot.data$outcome
              }
              
              ########
              ## Estimate MSM
              lm(outcomeAdj ~ CATreview + CATrevprevyr+ CATyrssincereview+ 
                   CATnumprevrevs+ CATlintt, weights = boot.data$swrev, 
                 data = boot.data)
            }
          }, error = function(cond) {
            
            return("fail")
          }
        )
        
        ## If no error was caught
        if (class(MF1) == "lm"){
          ### Save the coefficients
          boot_coefs[iter,] <- MF1$coefficients
          if (iter == 1){
            colnames(boot_coefs) <- names(MF1$coefficients)
          }
          
          ## Increment iterations by 1
          iter <- iter + 1
        }else{
          print("Error detected! Re-sampling")
        }
      }
      
  
    }else if (model == 2){ 
      ## Calculate number of country clusters
      n_country <- length(unique(catuse$cowcode))
      
      ## Number of coefficients
      n_coef <- 5
      
      ## Placeholder for holding bootstrap coefficients
      boot_coefs <- as.data.frame(matrix(nrow=iterations, ncol=(n_coef+1)))
      
      # Start bootstrap loop - since we're concerned about sampling datasets that have non-convergence
      # we make this a while loop and use tryCatch when dealing with non-feasible datasets.
      
      # Initialize iteration number
      iter <- 1
      while(iter <= iterations){
        print(iter)
        # If it's not the first run, bootstrap
        if (iter != 1){
          # Resample dataset by country clusters
          boot.data <- data.frame()
          for (kit in 1:n_country){
            cnum <- sample(unique(catuse$cowcode), size=1) # Sample a country
            sampled.country <- catuse[catuse$cowcode == cnum,]
            sampled.country$uniqueID <- kit
            boot.data <- rbind(boot.data, sampled.country) # Append to the bootstrap dataset
          }
          # If it's the first run, use the raw data
        }else{
          boot.data <- catuse
          boot.data$uniqueID <- boot.data$cowcode
        }
        
        ### Get Coefficients - catch any errors
        MF1 <- tryCatch(
          {
            # Denominator model (YEAR + treatment(review) history variables + time-varying covariates)
            denom.fit <-  glm(CATreview ~ lngdpl1 + lnpop 
                              +nhril1 +polityl1+ farissl1 + HRTpercentl1 + CATart22l1
                              +   CATrevprevyr + CATyrssincereview + CATnumprevrevs+nevdem +
                                CATregiondensityl1+regionall1+transdeml1+transition2l1+transparencyindex_l1+yr,family="binomial", data=boot.data)
            denom.p <- predict(denom.fit, type = "response")
            
            # Numerator model (YEAR + treatment(review) history variables)
            numer.fit <-  glm(CATreview ~  CATrevprevyr + CATyrssincereview + CATnumprevrevs + 
                                nevdem + yr, family="binomial", data=boot.data)
            numer.p <- predict(numer.fit, type = "response")
            if (denom.fit$converged == 0 || numer.fit$converged == 0){
              stop("Probability models didn't converge")
            }else{
              ## Calculate weights
              boot.data$treat.prob <- ifelse(boot.data$CATreview == 1, denom.p, denom.p)
              boot.data$d.pr.tr <- ifelse(boot.data$CATreview == 1, denom.p, 1-denom.p)
              boot.data$n.pr.tr <- ifelse(boot.data$CATreview == 1, numer.p, 1-numer.p)
              boot.data$w <- boot.data$n.pr.tr/boot.data$d.pr.tr
              
              ## Bias-adjusted outcome - alpha*(1-1)*Pr(1) + alpha*(1-0)*Pr(0) for units receiving treatment and alpha*(0-1)*Pr(1) + alpha(0-0)*Pr(0) for units receiving control
              boot.data$time.bias <- ifelse(boot.data$CATreview == 1, alpha*(1-boot.data$treat.prob), -alpha*(boot.data$treat.prob))
              
              ## Observation weight is product over time
              boot.data.dt <- data.table(boot.data)
              # Product of weights
              boot.data.dt[ intersect(order(year), which(!is.na(w))), swrev := cumprod(w), by=uniqueID]
              # Bias at time period
              boot.data.dt[ intersect(order(year), which(!is.na(time.bias))), s.time.bias := cumsum(time.bias), by=uniqueID]
              boot.data <- as.data.frame(boot.data.dt)
              
              
              ###################################################
              ##### Sensitivity analysis transformations
              ###################################################
              if (alpha !=0){
                boot.data$outcomeAdj <- boot.data$outcome - boot.data$s.time.bias
              }else{
                boot.data$outcomeAdj <- boot.data$outcome
              }
              
              ########
              ## Estimate MSM
              lm(outcomeAdj ~ CATreview + CATrevprevyr+ CATyrssincereview+ 
                   CATnumprevrevs+ CATlintt, weights = boot.data$swrev, 
                 data = boot.data)
            }
          }, error = function(cond) {
            
            return("fail")
          }
        )
        
        ## If no error was caught
        if (class(MF1) == "lm"){
          ### Save the coefficients
          boot_coefs[iter,] <- MF1$coefficients
          if (iter == 1){
            colnames(boot_coefs) <- names(MF1$coefficients)
          }
          
          ## Increment iterations by 1
          iter <- iter + 1
        }else{
          print("Error detected! Re-sampling")
        }
      }
      
    }
    
    
    ### Merge metadata with bootstrap iterations
    boot_coefs$model <- model
    boot_coefs$alpha <- alpha
    boot_coefs$iterations <- iterations
    ## Save results
    filen <- paste("CAT_LHRPS__model_",model,"_alpha_", alpha, "_iterations_", iterations, ".csv", sep="")
    write.csv(boot_coefs, filen)
    sink(NULL)
    
  } 
}  

sink(NULL)